We use a combination of the Sequence Description and Gene Ontology (GO) terms to fetch cell-wall-related gene candidates
# Annotation
annotation <- read_tsv(here("data/b2g/blast2go_20190117_export.txt.gz"),show_col_types=FALSE) %>%
mutate(`Sequence Name`=sub("\\.p\\d+$","",`Sequence Name`))
# GO dictionary
go_annot <- annotation %>% select(`Annotation GO ID`,`Annotation GO Term`,`Annotation GO Category`) %>%
rename(ID=`Annotation GO ID`,Term=`Annotation GO Term`,Category=`Annotation GO Category`) %>%
filter(!is.na(ID))
go_annot <- tibble(ID=unlist(str_split(go_annot$ID,"\\|"),use.names=FALSE),
Term=unlist(str_split(go_annot$Term,"\\|"),use.names=FALSE),
Category=unlist(str_split(go_annot$Category,"\\|"),use.names=FALSE)) %>% distinct()
# Expression
expression <- read_tsv(here("data/analysis/salmon/variance-stabilised_model-aware_gene-expression_data.tsv"),
show_col_types=FALSE)
# Reverse sample swap
expression %<>% rename(B2_2_24hrs_D="B2_2_12hrs_D",B2_2_12hrs_D="B2_2_24hrs_D")
# Gene of interest
goi <- unlist(annotation[grepl("cell wall",annotation$`Sequence Description`) |
grepl("cell wall",annotation$`Annotation GO Term`),"Sequence Name"],use.names=FALSE)
# replace (\\.p\\d+$) .p followed by any number of digits. The $ the sign that indicates the end of the text
goi <- sub("\\.p\\d+$","",goi)
# sanity
stopifnot(length(goi)==length(unique(goi)))
message(sprintf("There are %s candidates",length(goi)))
# Expression
vst <- expression %>% filter(rowname %in% goi) %>% column_to_rownames() %>% as.matrix()
sel <- rowSums(vst) > 0
vst <- vst[sel,]
vst %<>% as.data.frame() %>% rownames_to_column(var = "ID")
# Export
# write_tsv(vst, here("data/analysis/cell-wall/goi-vst-expression_with-geneID.tsv"))Next, we look at the correlation between cell wall thickness and change in expression (mean of 4 replicates). We keep only the time points that are present in both datasets:
1h, 4hs, 12hs,
24hs, 72hs, 120hs
#cell wall
cw <- read_tsv(here("doc/Cell-wall-measurement.tsv"))
cw %<>% filter(!time_in_hours %in% c(48,240))
cw_ctrl <- cw %>% filter(treatment %in% "control")
cw_cold <- cw %>% filter(treatment %in% "cold")
colnames(vst) %<>% gsub(pattern = "B2_2_", replacement = "")
colnames(vst) %<>% gsub(pattern = "60min", replacement = "1hrs_")
#calculating the average expression of reps
vst_avg <- vst %>% pivot_longer(cols = -ID, names_to = c("time", "rep"), names_sep = "_") %>%
group_by(time, ID) %>%
summarize(meanExp = mean(value)) %>%
pivot_wider(names_from = time, values_from = meanExp)
#reorder the columns based on the cw data
vst_avg <- vst_avg[, c(1, match(paste0(cw_ctrl$time_in_hours, "hrs"), colnames(vst_avg)))]
#correlation
vst_avg %<>%
rowwise() %>%
mutate(ctrl_cor = cor(c_across(`1hrs`:`120hrs`),
cw_ctrl$width_in_um %>% as.vector(),
method = "pearson"),
cold_cor = cor(c_across(`1hrs`:`120hrs`),
cw_cold$width_in_um %>% as.vector(),
method = "pearson"))Below we look at the distribution of the correlation values for cell wall genes.
# Reshape data for ggplot
df <- vst_avg %>%
select(ctrl_cor,cold_cor) %>%
pivot_longer(cols = everything()) %>%
mutate(Color = ifelse(value >= 0.7, "red",
ifelse(value <= -0.7, "blue", "gray")))
# Create the plot
ggplot(data = df, aes(x = name, y = value)) +
geom_violin() +
geom_jitter(aes(color = Color), width = 0.2) +
scale_color_manual(values = c("blue", "grey", "red")) +
theme(legend.position = "none", text = element_text(size = 13)) +
labs(x = "", y = "Correlation values") +
geom_hline(yintercept = 0.7, linetype = "dashed", color = "red") +
geom_hline(yintercept = -0.7, linetype = "dashed", color = "blue")#frequencies
df %<>%
mutate(
range = factor(case_when(
(value >= -1 & value < -0.9) ~ "-1",
(value >= -0.9 & value < -0.8) ~ "-0.9",
(value >= -0.8 & value < -0.7) ~ "-0.8",
(value >= -0.7 & value < -0.6) ~ "-0.7",
(value >= -0.6 & value < -0.5) ~ "-0.6",
(value >= -0.5 & value < -0.4) ~ "-0.5",
(value >= -0.4 & value < -0.3) ~ "-0.4",
(value >= -0.3 & value < -0.2) ~ "-0.3",
(value >= -0.2 & value < -0.1) ~ "-0.2",
(value >= -0.1 & value < 0) ~ "-0.1",
(value >= 0 & value < 0.1) ~ "0.1",
(value >= 0.1 & value < 0.2) ~ "0.2",
(value >= 0.2 & value < 0.3) ~ "0.3",
(value >= 0.3 & value < 0.4) ~ "0.4",
(value >= 0.4 & value < 0.5) ~ "0.5",
(value >= 0.5 & value < 0.6) ~ "0.6",
(value >= 0.6 & value < 0.7) ~ "0.7",
(value >= 0.7 & value < 0.8) ~ "0.8",
(value >= 0.8 & value < 0.9) ~ "0.9",
(value >= 0.9 & value <= 1) ~ "1",
TRUE ~ NA_character_),
levels = c( "-1", "-0.9", "-0.8", "-0.7", "-0.6", "-0.5", "-0.4", "-0.3", "-0.2", "-0.1",
"0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "1"))) %>%
group_by(range, name) %>%
summarise(freq = n()) %>%
ungroup() %>%
mutate(barcol = case_when(name == "ctrl_cor" ~ "forestgreen",
name == "cold_cor" ~"deepskyblue"))
df %>%
ggplot(aes(y = ifelse(test = name == "cold_cor", yes = -freq, no = freq),
x = range, fill = name)) +
geom_col() +
scale_y_continuous() +
scale_fill_manual(values = unique(df$barcol)) +
labs(x = "Correlation", y = "Number of genes") +
theme(text = element_text(size = 12))Next, we identify the mfuzz clusters associated with our highly correlated genes.
#genes highly correlated with widths change in cold and control
goi_cold <- vst_avg %>% filter(cold_cor >= 0.7 | cold_cor <= -0.7) %>% .$ID
goi_ctrl <- vst_avg %>% filter(ctrl_cor >= 0.7 | ctrl_cor <= -0.7) %>% .$ID
#highly correlated genes that are only in cold (and not in control)
goi_cold_only <- goi_cold[!goi_cold %in% goi_ctrl]
#mfuzz clusters
clusts <- read_csv(here("data/mfuzz/clustermembership.csv"))
clusts <- clusts[,-1]
print("Top: mfuzz clusters /n Bottom: Number of highly correlated genes (cold only) with cell wall growth in that mfuzz cluster")## [1] "Top: mfuzz clusters /n Bottom: Number of highly correlated genes (cold only) with cell wall growth in that mfuzz cluster"
## .
## 1 2 3 4 5 7 8
## 2 6 7 1 1 1 1
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2
## [5] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
## [9] tidyverse_2.0.0 magrittr_2.0.3 gplots_3.1.3 hyperSpec_0.100.0
## [13] xml2_1.3.5 ggplot2_3.4.2 lattice_0.21-8 here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.3 xfun_0.39 bslib_0.5.0
## [4] caTools_1.18.2 latticeExtra_0.6-30 tzdb_0.4.0
## [7] vctrs_0.6.3 tools_4.3.1 bitops_1.0-7
## [10] generics_0.1.3 parallel_4.3.1 fansi_1.0.4
## [13] highr_0.10 pkgconfig_2.0.3 KernSmooth_2.23-22
## [16] RColorBrewer_1.1-3 lifecycle_1.0.3 farver_2.1.1
## [19] compiler_4.3.1 deldir_1.0-9 brio_1.1.3
## [22] munsell_0.5.0 htmltools_0.5.5 sass_0.4.7
## [25] yaml_2.3.7 lazyeval_0.2.2 pillar_1.9.0
## [28] crayon_1.5.2 jquerylib_0.1.4 cachem_1.0.8
## [31] gtools_3.9.4 tidyselect_1.2.0 digest_0.6.33
## [34] stringi_1.7.12 labeling_0.4.2 rprojroot_2.0.3
## [37] fastmap_1.1.1 colorspace_2.1-0 cli_3.6.1
## [40] utf8_1.2.3 withr_2.5.0 scales_1.2.1
## [43] bit64_4.0.5 timechange_0.2.0 rmarkdown_2.23
## [46] jpeg_0.1-10 bit_4.0.5 interp_1.1-4
## [49] png_0.1-8 hms_1.1.3 evaluate_0.21
## [52] knitr_1.43 testthat_3.1.10 rlang_1.1.1
## [55] Rcpp_1.0.11 glue_1.6.2 rstudioapi_0.15.0
## [58] vroom_1.6.3 jsonlite_1.8.7 R6_2.5.1
Created by Aman Zare